The goals of this workshop include: - Learn about data wrangling principles (e.g., tidy data) - Import, pre-process & visualize data using R - Share recommended readings, activities for continued learning
Click Here for Workshop Slides
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Note that echo = FALSE, when added to the code chunk, prevents printing of the R code that generated the output.
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.5
## ✓ tibble 3.1.3 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
“These Community Mobility Reports aim to provide insights into what has changed in response to policies aimed at combating COVID-19. The reports chart movement trends over time by geography, across different categories of places such as retail and recreation, groceries and pharmacies, parks, transit stations, workplaces, and residential.”
# read in the Google Mobility dataset
google_mobility <- read_csv("https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv")
## Rows: 6969905 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): country_region_code, country_region, sub_region_1, sub_region_2, m...
## dbl (6): retail_and_recreation_percent_change_from_baseline, grocery_and_ph...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#google_mobility_byregion = "https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip"
R, as well as some packages, have built in datasets. They can be called with the data() function (e.g., data(state)). We will leverage a built in dataset with state metadata.
data(state)
state_lookup = tibble(sub_region_1 = state.name,
us_division = state.division,
us_region = state.region,
state_land_area = state.area)
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# filter for US
google_mobility_US <- google_mobility %>%
filter(country_region == "United States") %>%
mutate(dt_date = date) %>%
mutate(dt_month = lubridate::month(dt_date),
dt_month_ = lubridate::month(dt_date, label=T),
dt_year= lubridate::year(dt_date),
dt_day = lubridate::day(dt_date),
dt_wkday_l = lubridate::wday(dt_date, label=T),
dt_wkday_n = lubridate::wday(dt_date)) %>%
mutate(dt_weekend = ifelse(dt_wkday_l %in% c("Sat", "Sun"), TRUE, FALSE))
google_mobility_US_w_lu <- google_mobility_US %>%
inner_join(state_lookup)
## Joining, by = "sub_region_1"
#skimr::skim(google_mobility_US)
# filter with specific entries
google_mobility_US_NY <- google_mobility_US %>%
filter(sub_region_1 == "New York")
google_mobility_US_FL <- google_mobility_US %>%
filter(sub_region_1 == "Florida")
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# filter to a 'vector' of entries
google_mobility_US_targeted <- google_mobility_US %>%
filter(sub_region_1 %in% c("California", "New York", "Florida"))
# base R approach
table(google_mobility_US$sub_region_1)
##
## Alabama Alaska Arizona
## 36832 5860 9131
## Arkansas California Colorado
## 36147 32532 25976
## Connecticut Delaware District of Columbia
## 5247 2332 583
## Florida Georgia Hawaii
## 38309 78125 2915
## Idaho Illinois Indiana
## 18822 51305 51353
## Iowa Kansas Kentucky
## 47985 31176 53927
## Louisiana Maine Maryland
## 33189 9651 14420
## Massachusetts Michigan Minnesota
## 8439 44308 42804
## Mississippi Missouri Montana
## 39079 53834 15249
## Nebraska Nevada New Hampshire
## 25598 7253 6388
## New Jersey New Mexico New York
## 12818 16626 35884
## North Carolina North Dakota Ohio
## 55228 10965 50615
## Oklahoma Oregon Pennsylvania
## 35647 19260 37870
## Rhode Island South Carolina South Dakota
## 3489 26518 14735
## Tennessee Texas Utah
## 50368 110216 13291
## Vermont Virginia Washington
## 7904 69502 20495
## West Virginia Wisconsin Wyoming
## 24228 38884 11003
# number of records by state
records_per_state = google_mobility_US %>%
group_by(sub_region_1) %>%
summarise(n_records = n()) %>%
arrange(sub_region_1)
records_per_state
## # A tibble: 52 × 2
## sub_region_1 n_records
## <chr> <int>
## 1 Alabama 36832
## 2 Alaska 5860
## 3 Arizona 9131
## 4 Arkansas 36147
## 5 California 32532
## 6 Colorado 25976
## 7 Connecticut 5247
## 8 Delaware 2332
## 9 District of Columbia 583
## 10 Florida 38309
## # … with 42 more rows
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# group data by weekday
metrics_month_weekend_US <- google_mobility_US %>%
group_by(sub_region_1, dt_year, dt_month_, dt_wkday_l) %>%
summarise(avg_change_parks = mean(parks_percent_change_from_baseline, na.rm=T),
avg_change_retail_rec = mean(retail_and_recreation_percent_change_from_baseline, na.rm=T),
avg_change_workplaces = mean(workplaces_percent_change_from_baseline, na.rm=T)) %>%
inner_join(state_lookup)
## `summarise()` has grouped output by 'sub_region_1', 'dt_year', 'dt_month_'. You can override using the `.groups` argument.
## Joining, by = "sub_region_1"
# plot summary data across all states
allUS_retail = ggplot(metrics_month_weekend_US %>% filter(us_region %in% c("South", "West")), aes(sub_region_1, avg_change_retail_rec)) +
geom_boxplot() +
theme_minimal() +
theme(axis.text.x=element_text(angle=90)) +
facet_wrap(dt_year~.) +
labs(x="Month", y="Retail (% Change)")
allUS_parks = ggplot(metrics_month_weekend_US %>% filter(us_region %in% c("South", "West")), aes(sub_region_1, avg_change_parks)) +
geom_boxplot() +
theme_minimal() +
facet_wrap(dt_year~.) +
theme(axis.text.x=element_text(angle=90)) +
labs(x="Month", y="Parks (% Change)")
allUS_retail
allUS_parks
jitter_retail <- ggplot(metrics_month_weekend_US, aes(dt_month_,
avg_change_retail_rec,
group=us_region,
color=us_region)) +
#geom_point() +
geom_jitter() +
theme_minimal() +
labs(x="Month", y="Retail & Recreation (% Change)")
smooth_retail = ggplot(metrics_month_weekend_US, aes(dt_month_,
avg_change_retail_rec,
group=us_region,
color=us_region)) +
#geom_point() +
#geom_jitter() +
geom_smooth() +
theme_minimal() +
labs(x="Month", y="Retail & Recreation (% Change)")
smooth_parks = ggplot(metrics_month_weekend_US, aes(dt_month_,
avg_change_parks,
group=us_region,
color=us_region)) +
#geom_point() +
#geom_jitter() +
geom_smooth() +
theme_minimal() +
labs(x="Month", y="Parks (% Change)")
jitter_retail
smooth_retail
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
smooth_parks
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
# create a plot of weekly trends
early_covid_retail = ggplot(google_mobility_US_targeted %>%
filter(dt_month %in% c(1,2,3,4,5)) %>%
filter(dt_year == 2020), aes(dt_date,
retail_and_recreation_percent_change_from_baseline,
color=sub_region_1)) +
#geom_point() +
geom_jitter() +
theme_minimal() +
facet_wrap(.~sub_region_1) +
labs(x="Month", y="Retal & Rec (% Change)", title="2020")
early_covid_parks = ggplot(google_mobility_US_targeted %>%
filter(dt_month %in% c(1,2,3,4,5)) %>%
filter(dt_year == 2020), aes(dt_date,
parks_percent_change_from_baseline,
color=sub_region_1)) +
#geom_point() +
geom_jitter() +
theme_minimal() +
facet_wrap(.~sub_region_1) +
labs(x="Month", y="Parks (% Change)", title="2020")
nxtyear_covid_retail = ggplot(google_mobility_US_targeted %>%
filter(dt_month %in% c(1,2,3,4,5)) %>%
filter(dt_year == 2021), aes(dt_date,
retail_and_recreation_percent_change_from_baseline,
color=sub_region_1)) +
#geom_point() +
geom_jitter() +
theme_minimal() +
facet_wrap(.~sub_region_1) +
labs(x="Month", y="Retal & Rec (% Change)", title="2021")
cowplot::plot_grid(early_covid_retail, early_covid_parks, ncol=1)
## Warning: Removed 2050 rows containing missing values (geom_point).
## Warning: Removed 7320 rows containing missing values (geom_point).
cowplot::plot_grid(early_covid_retail, nxtyear_covid_retail, ncol=1)
## Warning: Removed 2050 rows containing missing values (geom_point).
## Warning: Removed 2915 rows containing missing values (geom_point).
cowplot::plot_grid(allUS_retail, allUS_parks, ncol=1)
cowplot::plot_grid(smooth_retail, smooth_parks, ncol=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Variable names -----
# retail_and_recreation_percent_change_from_baseline = col_double(),
# grocery_and_pharmacy_percent_change_from_baseline = col_double(),
# parks_percent_change_from_baseline = col_double(),
# transit_stations_percent_change_from_baseline = col_double(),
# workplaces_percent_change_from_baseline = col_double(),
# residential_percent_change_from_baseline = col_double()
early_covid_period_data = google_mobility_US_w_lu %>%
filter(dt_month %in% c(1,2,3,4,5,6)) %>%
filter(dt_year == 2020) %>%
filter(sub_region_1 %in% c("New York", "California", "Florida", "Wyoming", "Texas", "Arizona","Montana", "South Dakota", "Nevada"))
ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=sub_region_1)) +
#geom_point() +
theme_void() +
geom_jitter() +
facet_wrap(.~sub_region_1) +
theme(legend.position = "none")
## Warning: Removed 40493 rows containing missing values (geom_point).
ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=sub_region_1)) +
geom_jitter() +
facet_wrap(.~sub_region_1)
## Warning: Removed 40493 rows containing missing values (geom_point).
ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=sub_region_1)) +
#geom_point() +
theme_bw() +
geom_jitter() +
facet_wrap(.~sub_region_1)
## Warning: Removed 40493 rows containing missing values (geom_point).
ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=sub_region_1)) +
#geom_point() +
theme_dark() +
geom_jitter() +
facet_wrap(.~sub_region_1)
## Warning: Removed 40493 rows containing missing values (geom_point).
ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=us_region)) +
#geom_point() +
theme_bw() +
geom_jitter() +
facet_wrap(.~sub_region_1) +
labs(x= "Date", y="Retail & Recreation (% Change)")
## Warning: Removed 40493 rows containing missing values (geom_point).
ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=us_region)) +
#geom_point() +
geom_jitter() +
facet_wrap(.~sub_region_1) +
labs(x= "Date", y="Retail & Recreation (% Change)") +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle=90))
## Warning: Removed 40493 rows containing missing values (geom_point).
fig1 <- ggplot(early_covid_period_data, aes(dt_date,
retail_and_recreation_percent_change_from_baseline,
color=us_region)) +
#geom_point() +
#geom_jitter() +
geom_smooth() +
facet_wrap(.~sub_region_1) +
labs(x= "Date", y="Retail & Recreation (% Change)",
#color="State",
tag = "Figure 1",
title = "Google Mobility: Change over Time",
subtitle = "i.e., Retail & Recreation, early COVID-19 period (01/20 - 06/20)",
caption = "Google Mobility: Retail & Recreation (% Change) over early COVID period (01/20 - 06/20)") +
theme_bw(base_size = 12, base_family = "Arial")
fig2 <- ggplot(early_covid_period_data, aes(dt_date,
parks_percent_change_from_baseline,
color=us_region)) +
#geom_point() +
#geom_jitter() +
geom_smooth() +
facet_wrap(.~sub_region_1) +
labs(x= "Date", y="Retail & Recreation (% Change)",
#color="State",
tag = "Figure 2",
title = "Google Mobility: Change over Time",
subtitle = "i.e., Parks, early COVID-19 period (01/20 - 06/20)",
caption = "Google Mobility: Parks (% Change) over early COVID period (01/20 - 06/20)") +
theme_bw(base_size = 12, base_family = "Arial")
fig1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 16583 rows containing non-finite values (stat_smooth).
fig2
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40493 rows containing non-finite values (stat_smooth).
largest_changes_parks_long <- google_mobility_US %>%
group_by(sub_region_1, dt_year) %>%
summarise(avg_change_parks = mean(parks_percent_change_from_baseline, na.rm=T)) %>%
arrange(sub_region_1)
## `summarise()` has grouped output by 'sub_region_1'. You can override using the `.groups` argument.
largest_changes_parks_wide <- largest_changes_parks_long %>%
#pivot_wider(id_cols=c())
pivot_wider(names_from = dt_year, values_from = avg_change_parks) %>%
mutate(change_2021_2020 = `2021` - `2020`)
top10_changes <- largest_changes_parks_wide %>%
arrange(-change_2021_2020) %>%
head(n=10)
bottom10_changes <- largest_changes_parks_wide %>%
arrange(change_2021_2020) %>%
head(n=10)
largest_changes_parks_long
## # A tibble: 104 × 3
## # Groups: sub_region_1 [52]
## sub_region_1 dt_year avg_change_parks
## <chr> <dbl> <dbl>
## 1 Alabama 2020 17.9
## 2 Alabama 2021 8.85
## 3 Alaska 2020 52.9
## 4 Alaska 2021 79.8
## 5 Arizona 2020 -1.97
## 6 Arizona 2021 -1.91
## 7 Arkansas 2020 48.2
## 8 Arkansas 2021 45.1
## 9 California 2020 6.17
## 10 California 2021 12.5
## # … with 94 more rows
largest_changes_parks_wide
## # A tibble: 52 × 4
## # Groups: sub_region_1 [52]
## sub_region_1 `2020` `2021` change_2021_2020
## <chr> <dbl> <dbl> <dbl>
## 1 Alabama 17.9 8.85 -9.04
## 2 Alaska 52.9 79.8 26.9
## 3 Arizona -1.97 -1.91 0.0582
## 4 Arkansas 48.2 45.1 -3.07
## 5 California 6.17 12.5 6.31
## 6 Colorado 23.2 20.3 -2.95
## 7 Connecticut 51.2 49.6 -1.63
## 8 Delaware 46.8 62.7 15.9
## 9 District of Columbia -34.8 -20.2 14.6
## 10 Florida -14.7 -8.17 6.57
## # … with 42 more rows
top10_changes
## # A tibble: 10 × 4
## # Groups: sub_region_1 [10]
## sub_region_1 `2020` `2021` change_2021_2020
## <chr> <dbl> <dbl> <dbl>
## 1 Alaska 52.9 79.8 26.9
## 2 Rhode Island 69.2 90.5 21.3
## 3 Maine 59.0 80.2 21.2
## 4 South Dakota 106. 126. 20.4
## 5 Hawaii -46.5 -28.8 17.7
## 6 Vermont 52.8 69.6 16.8
## 7 Montana 53.9 70.6 16.6
## 8 Wyoming 43.8 60.1 16.4
## 9 Delaware 46.8 62.7 15.9
## 10 Oregon 46.5 62.2 15.7
bottom10_changes
## # A tibble: 10 × 4
## # Groups: sub_region_1 [10]
## sub_region_1 `2020` `2021` change_2021_2020
## <chr> <dbl> <dbl> <dbl>
## 1 Mississippi 4.92 -6.71 -11.6
## 2 Kansas 67.3 57.5 -9.81
## 3 Alabama 17.9 8.85 -9.04
## 4 West Virginia 3.08 -4.78 -7.86
## 5 Massachusetts 59.3 51.4 -7.84
## 6 Louisiana -8.68 -16.4 -7.68
## 7 Oklahoma 10.4 3.33 -7.07
## 8 Utah 61.5 56.1 -5.45
## 9 Nebraska 64.3 59.6 -4.70
## 10 Indiana 46.3 42.4 -3.88